Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)
library(ggrepel)
library(truncnorm)Libraries
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)
library(ggrepel)
library(truncnorm)Paths
metadata_path <-
"data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
depth_by_chrom_good_desj_path <-
"../Crypto_Desjardins/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
depth_by_chrom_good_ashton_path <-
"../Crypto_Ashton/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
cnv_chroms_path <-
"../Crypto_Desjardins_Ashton/results/02.Dataset/cnv/cnv_chromosomes.tsv"
chrom_metrics_out_path <-
"results/tables/chrom_metrics.tsv"
ploidy_path <- "results/tables/ploidy_from_plots.tsv"Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99.
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.
metadata <- read.delim(
metadata_path,
header=TRUE,
sep=",",
stringsAsFactor = TRUE)
metadata <- metadata %>%
select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
filter(!strain == "H99") %>%
group_by(dataset, lineage)%>%
mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
ungroup() %>%
group_by(lineage)%>%
mutate(samples_in_lineage = n_distinct(sample))%>%
ungroup()%>%
mutate(total_samples = n_distinct(sample))%>%
droplevels()ploidy <- read.delim(
ploidy_path,
header = TRUE,
sep = "\t",
stringsAsFactors = FALSE)%>%
select(sample, strain, lineage,
accession, chromosome, ploidy, type, gappy,
distribution_pattern, fraction_duplications,
steps, smile, uniform)%>%
distinct()
ploidy$ploidy <- factor(ploidy$ploidy, levels = c("haploid", "diploid", "triploid", "tetraploid"))
ploidy$type <- factor(ploidy$type, levels = c("full", "large", "partial"))depth_by_chrom_good_desjardins <- read.delim(
depth_by_chrom_good_desj_path,
header=TRUE, sep="\t")
depth_by_chrom_good_ashton <- read.delim(
depth_by_chrom_good_ashton_path,
header=TRUE, sep="\t")
depth_by_chrom <- rbind(depth_by_chrom_good_desjardins, depth_by_chrom_good_ashton)%>%
select(sample, accession, norm_chrom_mean, norm_chrom_median)cnv_chroms <- read.delim(
cnv_chroms_path,
header = TRUE,
sep = "\t")%>%
mutate(chromosome = str_pad(chromosome, 2, pad = "0"))
cnv_chroms$chromosome <- as.factor(cnv_chroms$chromosome)
levels(cnv_chroms$chromosome) <- paste("chr", levels(cnv_chroms$chromosome), sep="")Calculate
* percent_largest_region : Percentage of length of chromosome covered by the largest CNV region of the chormosome.
* diff_span_cov: Difference between span_percent and coverage_percent.
* n_regions_100kb: Number of regions per 100kb
cnv_chroms <- cnv_chroms %>%
mutate(percent_largest_region = (size_largest_region / length)*100,
diff_span_cov = abs(span_percent - coverage_percent),
n_regions_100kb = (n_regions / length) * 100000)Pivot table to get only one line per chromosome.
Join with depth per chromopsome and with ploidy informations.
chrom_metrics <- cnv_chroms %>%
pivot_wider(names_from = cnv,
values_from = c("n_regions","total_size_regions","coverage_percent",
"span_percent","diff_span_cov","size_smallest_region","size_largest_region",
"percent_largest_region", "n_regions_100kb",
"std_regions_size","norm_depth_mean","norm_depth_median",
"smooth_depth_mean","smooth_depth_median")) %>%
left_join(depth_by_chrom, by = c("sample", "accession"))%>%
left_join(ploidy, by = c("sample", "accession", "lineage", "chromosome"))Simulation of haploid chromosomes with all proportions (in steps of 5) of percentage of chromosome covered by deletions, duplications and single-copy regions.
Select simulation of percentage of deletion = 5.
n <- 100000
sd_v <- 0.2
p <- data.frame(p_del = c(rep(seq(0,100, by = 5), each = 100)),
p_sc = c(rep(seq(0,100, by = 5), 100)))%>%
mutate(sum = p_del + p_sc)%>%
filter(sum <= 100)%>%
distinct()
sample_summary <- p %>%
mutate(p_dup = 100 - p_del - p_sc,
n_del = p_del * n / 100,
n_sc = p_sc * n / 100,
n_dup = p_dup * n / 100)%>%
rowwise()%>%
mutate(median_depth = median(c(rtruncnorm(n = n_del, a = 0, mean = 0, sd = sd_v),
rnorm(n_sc, mean = 1, sd = sd_v),
rnorm(n_dup, mean = 2, sd = sd_v))),
mean_depth = mean(c(rtruncnorm(n = n_del, a = 0, mean = 0, sd = sd_v),
rnorm(n_sc, mean = 1, sd = sd_v),
rnorm(n_dup, mean = 2, sd = sd_v))))%>%
select(p_del,p_sc,p_dup, median_depth, mean_depth)%>%
mutate(category = paste(p_del,p_sc,p_dup, sep = "_"),
dup_filter = ifelse(p_dup >= 50, "half_duplication", "small_duplication"),
del_filter = ifelse(p_del >= 50, "half_deletion", "small_deletion"))
trend <- filter(sample_summary, p_del == 5)ggplot() + theme_minimal()
p <- ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point()+
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "vertical")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")
p1 <- ggMarginal(
p,
type = "boxplot",
margins = "both",
size = 5,
groupColour = FALSE,
groupFill = FALSE
)
p1To create a version of this notebook with and without the filter This version is not filtered
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.1)+
geom_point(aes(color = type, shape = ploidy))+
scale_color_brewer(name = "Type", palette = "Set1") +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications",
shape = "Ploidy")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median))+
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_point(aes(color = coverage_percent_deletion))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = coverage_percent_deletion))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = coverage_percent_deletion))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median))+
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_point(aes(color = n_regions_100kb_duplication))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = n_regions_100kb_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = n_regions_100kb_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_point(aes(color = span_percent_duplication)) +
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = span_percent_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = span_percent_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_point(aes(color = diff_span_cov_duplication)) +
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = diff_span_cov_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = diff_span_cov_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")max_color_range <- paste("2.7 -", max(chrom_metrics$norm_depth_median_duplication, na.rm = TRUE))
colors <- c("#FDE725FF","#95D840FF","#29AF7FFF","#287D8EFF","#440154FF" )
names(colors) <-c("0 - 1.5", "1.6 - 2", "2.1 - 2.3", "2.4 - 2.6", max_color_range)
ggplot(filter(chrom_metrics, smooth_depth_median_duplication < 4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(aes(color = cut(norm_depth_median_duplication,
breaks = c(-Inf, 1.5, 2, 2.3, 2.6, Inf),
labels = names(colors))),
alpha = 1) +
scale_color_manual(values = colors,
name = "Median Depth of dupCNV\nWindows in Chromosome")+
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_depth_median_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = norm_depth_median_duplication))+
geom_abline(intercept = 0, slope = 1, color = "gray", linetype = "solid")+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_depth_median_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
scale_y_continuous(limits = c(0,5))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = norm_depth_median_duplication))+
geom_abline(intercept = 0, slope = 1, color = "gray", linetype = "solid")+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,5))+
theme_minimal() +
theme(legend.position = "bottom")colors <- c("#FDE725FF","#95D840FF","#29AF7FFF","#287D8EFF","#440154FF" )
names(colors) <-c("0 - 1.5", "1.6 - 2", "2.1 - 2.3", "2.4 - 2.6", "2.7 - 4")
ggplot(filter(chrom_metrics, smooth_depth_median_duplication < 4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(aes(color = cut(smooth_depth_median_duplication,
breaks = c(-Inf, 1.5, 2, 2.3, 2.6, Inf),
labels = names(colors))),
alpha = 1) +
scale_color_manual(values = colors,
name = "Smooth Median Depth of dupCNV\nWindows in Chromosome")+
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = smooth_depth_median_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = smooth_depth_median_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = smooth_depth_median_duplication))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
scale_y_continuous(limits = c(0,5))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = smooth_depth_median_duplication))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,5))+
theme_minimal() +
theme(legend.position = "bottom")colors <- c("#FDE725FF","#95D840FF","#29AF7FFF","#287D8EFF","#440154FF" )
names(colors) <-c("0 - 5", "6 - 10", "11 -15", "16 - 20", "21 - 100")
ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
# geom_point(aes(color = percent_largest_region_single_copy))+
# scale_color_viridis_c(direction = -1)+
geom_point(aes(color = cut(percent_largest_region_single_copy,
breaks = c(0,5,10,15,20,100),
labels = names(colors)))) +
scale_color_manual(values = colors,
name = "Largest Single-Copy Region")+
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "vertical")ggplot(chrom_metrics, aes(x = coverage_percent_duplication, y = percent_largest_region_single_copy))+
geom_point(aes(color = norm_chrom_median))+
scale_color_viridis_c(direction = -1) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = percent_largest_region_single_copy))+
geom_point(aes(color = coverage_percent_duplication))+
scale_color_viridis_c(direction = -1) +
theme_minimal() +
theme(legend.position = "bottom")ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.5, aes(color = lineage))+
geom_text_repel(aes(label = sample), size = 3, max.overlaps = 10) +
scale_x_continuous(limits = c(0,100))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications",
shape = "Ploidy")Names, lower left
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.5, aes(color = lineage))+
geom_text_repel(aes(label = paste(sample, chromosome)), size = 3, max.overlaps = 10) +
scale_x_continuous(limits = c(0,50))+
scale_y_continuous(limits = c(1.5, 2.25))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications",
shape = "Ploidy")Names, middle right
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.5, aes(color = lineage))+
geom_text_repel(aes(label = paste(sample, chromosome)), size = 3, max.overlaps = 10) +
scale_x_continuous(limits = c(50,100))+
scale_y_continuous(limits = c(1.5,2))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications",
shape = "Ploidy")Names, upper right
ggplot(filter(chrom_metrics, coverage_percent_duplication >= 20 | norm_chrom_median >= 1.4),
aes(x = coverage_percent_duplication, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "solid") +
geom_line(data = trend, aes(x= p_dup, y = median_depth), color = "gray")+ # Line of simulated data
geom_point(alpha = 0.5, aes(color = lineage))+
geom_text_repel(aes(label = paste(sample, chromosome)), size = 3, max.overlaps = 20) +
scale_x_continuous(limits = c(60,100))+
scale_y_continuous(limits = c(2,2.75))+
theme_minimal() +
theme(legend.position = "bottom", legend.direction = "horizontal")+
labs(y = "Normalized Chromosome Median",
x = "Percent of Chromosome Covered by Duplications",
shape = "Ploidy")